/****************************************************************************************************
Do file name: 4_MergeCleaning.do
Author: Thiago Scot
This version: April 16th, 2019
DEscription: Cleans datasets after merging by strings.Output is dataset at Judge level with identifier to be 
merged with productivity data.
****************************************************************************************************/

clear all
set more off

use  "$temp/judges_merged_concurso.dta" 

drop _merge

drop if Estado == "Federal"

merge m:1 Judge_ID using "$temp/judges_matchit_new.dta"
rename _merge _merge2


*Bring judges from productivity data
preserve
	clear
	use "$temp/CnjDataset_clean.dta"																					  
	gduplicates drop Judge_idP, force
	keep Judge_idP State_cod State
	tempfile temp
	save "`temp'"
restore

*Merge and drop those not in Concurso data
merge m:1 Judge_idP using "`temp'"
drop if _merge == 2
drop _merge  Judge


lab def State_cod 11 "Rondônia/RO" 12 "Acre/AC" 13 "Amazonas/AM" 14 "Roraima/RR" 15 "Pará/PA" ///
16 "Amapá/AP" 17 "Tocantins/TO" 21 "Maranhão/MA" 22 "Piauí/PI" 23 "Ceará/CE" 24 "Rio Grande do Norte/RN" ///
25 "Paraíba/PB" 26 "Pernambuco/PE" 27 "Alagoas/AL" 28 "Sergipe/SE" 29 "Bahia/BA" 31 "Minas Gerais/MG" /// 
32 "Espírito Santo/ES" 33 "Rio de Janeiro/RJ" 35 "São Paulo/SP" 41 "Paraná/PR" 42 "Santa Catarina/SC" ///
43 "Rio Grande do Sul/RS" 50 "Mato Grosso do Sul/MS" 51 "Mato Grosso/MT" 52 "Goiás/GO" 53 "Distrito Federal/DF", replace
lab val State_cod State_cod


drop rank _merge_match
*rename classification gender
 
 
*Checking with State of Concurso is same state in which Judge works
gen match_state = (Estado == State) if _merge2 == 3
tab match_state
/* For 94% of observations, we are matching a judge's exam performance in the entrance exam for 
the same state for which we observe performance on the judge. For the remaining 189 observations, in most cases
we observe the judge in more than one exam, so their performance in a different exam is tagged as from a different
state. We will drop these 189 observations (but note this does not mean we drop 189 judges, several of these are 
already identified in the 2,309 judges we match.

match_state |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |        205        6.26        6.26
          1 |      3,068       93.74      100.00
------------+-----------------------------------
      Total |      3,273      100.00
*/


*Generate normalized version of grades 
foreach var of varlist Final_grade Grade_Titles Essay_Exam_Civil Essay_Exam_Penal Objective_Exam Oral_Exam Written_Exam {
	bys Concurso: egen mean_aux = mean(`var')
	bys Concurso: egen sd_aux   = sd(`var')
	gen `var'_z = (`var' - mean_aux)/sd_aux
	
	drop mean_aux sd_aux
}

lab var fullname "Nome na Base do Concurso"
lab var Concurso "Concurso prestado"
lab var Year "Ano do Concurso"
lab var Rank "Ranking final no concurso"
lab var Rank_imp "Ranking imputado (nota final)"
lab var Final_grade "Nota Final no concurso"
lab var Grade_Titles "Nota no exame de titulos"
lab var Essay_Exam "Nota na prova discursiva"
lab var Objective_Exam "Nota na prova objetiva"
lab var Oral_Exam "Nota na prova oral"
lab var Written_Exam "Nota na prova escrita 0"
lab var Written_Exam1 "Nota na prova escrita 1"
lab var Written_Exam2 "Nota na prova escrita 2" 
lab var Essay_Exam_Civil "Nota na prova de sentenca - Civil"
lab var Essay_Exam_Penal "Nota na prova de sentenca - Penal"
lab var Objective_Questions "Nota na prova objetiva 2"
lab var Average_Written "Nota media nas provas escritas"
lab var Specific_Exams "Nota na prova especifica (?)"
lab var Name "Nome (original)"
lab var Number "Numero do candidato no concurso"
lab var ResultadoFinaldata "Data de resultado do Concurso"
lab var Remuneracao "Remuneracao do Concurso"
lab var Vagas "Numero de vagas do Concurso"
lab var Tribunal "Tribunal do Concurso"
lab var Estado "Estado do Concurso"
lab var banca  "Instituicao responsavel pelo concurso"
lab var oral_size "Numero de componentes da Banca Oral"
lab var oral_females "Numero de mulheres da Banca Oral"
lab var Judge_ID "Identificador de Juiz (dados concurso)"
*lab var Judge_id "Identificador de Juiz (produtividade raw)"
lab var Judge1 "Nome do Juiz (base de produtividade)"
lab var similscore "Score de similaridade do nome (match)"
lab var _merge2 "Match Concurso/Produtividade?"
lab var State "Estado (produtividade)"
*lab var gender "Genero do Juiz (match Censo)"
lab var Final_grade_z "Final Grade (standardized)"
lab var Grade_Titles_z "Titles Grade (standardized)"
lab var Essay_Exam_Civil_z "Civil Essay Grade (standardized)"
lab var Essay_Exam_Penal_z "Penal Essay Grade (standardized)"
lab var Objective_Exam_z "Objective Grade (standardized)"
lab var Oral_Exam_z "Oral Grade (standardized)"
lab var Written_Exam_z "Written Exam Grade (standardized)"

drop if match_state == 0								//dropping judges which are wrongly matched by state
keep if _merge2 == 3									//dropping judges not matched by name
drop if fullname == "MARIANA MARINHO 1 MACHADO"			//Dropping one duplicate with incorrect name
drop if fullname == "JOSE HERCY PONTE DE ALENCAR" & 	Concurso == "TJCEJUIZ2004"  //Did not enter 2004 and entered in 2011, both in Ceara
drop if fullname == "JOAO AENDER CAMPOS CREMASCO" & 	Concurso == "TJSP_05_2" 	//Did not enter 2005 and entered 2007

encode Concurso, gen(Concurso_enc)

drop fullname Judge1 Concurso Name  Number index    ///
	 DataExtracted ResultadoFinaldata Remuneracao Vagas Inscritos banca link oral_size oral_females resultados O P Q ///
	Tribunal Estado Judge_ID

order Judge_idP Concurso_enc Year Rank Rank_imp 
sort Concurso_enc Year


saveold "$temp/JudgesDataset_clean.dta", replace
